# Script to gather and clean information about each generating unit 
library(pacman)
p_load(
  here, fst, data.table, readxl, magrittr, janitor, lutz, tidyr
)

# Starting with table of plants/units we have to match 
cems_unit_dt = 
  read.fst(
    here("Data/electricity-generation/cems/raw_cems_2019.fst"),
    as.data.table = TRUE
  )[,.(
    year = 2019,
    namepcap_99 = quantile(gload_mw,0.99, na.rm = TRUE),
    first_op_date = min(op_date, na.rm = TRUE),
    last_op_date = max(op_date, na.rm = TRUE),
    tot_gload = sum(gload_mw, na.rm = TRUE),
    var_gload = var(gload_mw, na.rm = TRUE),
    tot_rows = .N
    ),
    keyby = .(orispl_code, unitid)
  ][# Removing units with no actual production
    tot_gload > 0
  ]

# Getting units open in 2018 and 2020
cems_units_2018_dt = 
  read.fst(
    here("Data/electricity-generation/cems/raw_cems_2018.fst"),
    as.data.table = TRUE
  )[,
    .(last_op_date_18 = max(op_date, na.rm = TRUE)),
    keyby = .(orispl_code, unitid)
  ]
cems_units_2020_dt = 
  read.fst(
    here("Data/electricity-generation/cems/raw_cems_2020.fst"),
    as.data.table = TRUE
  )[,
    .(first_op_date_20 = min(op_date, na.rm = TRUE)),
    keyby = .(orispl_code, unitid)
  ]

cems_unit_dt = 
  merge(
    cems_unit_dt, 
    cems_units_2018_dt,
    by = c('orispl_code','unitid'),
    all.x = TRUE
  ) |>
  merge(
    cems_units_2020_dt,
    by = c('orispl_code','unitid'),
    all.x = TRUE
  )
# Setting the open status
cems_unit_dt[,
  open_status := fcase(
    is.na(last_op_date_18) & first_op_date > '01-01-2019', 'opened during 2019',
    is.na(first_op_date_20) & last_op_date < '12-31-2019', 'closed during 2019',
    default = 'fully operational'
  )
]
# Calculating percentages by open status
cems_unit_dt[,.(
  num_units = .N,
  pct_gload = sum(tot_gload)/sum(cems_unit_dt$tot_gload)),
  by = open_status
]

# Crosswalk from EPA to EIA 
epa_eia_xwalk_dt = fread(
  here("Data/electricity-generation/plant-info/epa_eia_crosswalk.csv")
) |> clean_names()


# eGRID PLant data for unit -> balancing authority, and lat/lon
egrid_plant_dt = 
  rbind(
    read_xlsx(
      here("Data/electricity-generation/egrid/egrid2018_data_v2.xlsx"),
      sheet = "PLNT18",
      skip = 1
    ) |> 
    clean_names() |> 
    data.table() %>% 
    .[,.(
      orispl_code = orispl,
      ba_code_egrid = bacode, 
      nerc_egrid = nerc,
      egrid_subregion = subrgn,
      year = 2018,
      fipsst, fipscnty,
      lat_egrid = lat, lon_egrid = lon
    )],
    read_xlsx(
      here("Data/electricity-generation/egrid/egrid2019_data.xlsx"),
      sheet = "PLNT19",
      skip = 1
    ) |> 
      clean_names() |> 
      data.table() %>% 
      .[,.(
        orispl_code = orispl,
        ba_code_egrid = bacode, 
        nerc_egrid = nerc,
        egrid_subregion = subrgn,
        year = 2019,
        fipsst, fipscnty,
        lat_egrid = lat, lon_egrid = lon
      )],
    read_xlsx(
      here("Data/electricity-generation/egrid/egrid2020_data.xlsx"),
      sheet = "PLNT20",
      skip = 1
    ) |> 
      clean_names() |> 
      data.table() %>% 
      .[,.(
        orispl_code = orispl,
        ba_code_egrid = bacode, 
        nerc_egrid = nerc,
        egrid_subregion = subrgn,
        year = 2020,
        fipsst, fipscnty,
        lat_egrid = lat, lon_egrid = lon
      )]
  )

# eGRID Generator ID's (has namepcap)
egrid_gen_dt = 
  rbind(
    read_xlsx(
      here("Data/electricity-generation/egrid/egrid2018_data_v2.xlsx"),
      sheet = "GEN18",
      skip = 1
    ) |> 
      clean_names() |> 
      data.table() %>% 
      .[,.(
        orispl_code = orispl,
        genid,
        year,
        namepcap_egrid = namepcap
      )],
    read_xlsx(
      here("Data/electricity-generation/egrid/egrid2019_data.xlsx"),
      sheet = "GEN19",
      skip = 1
    ) |> 
      clean_names() |> 
      data.table() %>% 
      .[,.(
        orispl_code = orispl,
        genid,
        year,
        namepcap_egrid = namepcap
      )],
    read_xlsx(
      here("Data/electricity-generation/egrid/egrid2020_data.xlsx"),
      sheet = "GEN20",
      skip = 1
    ) |> 
      clean_names() |> 
      data.table() %>% 
      .[,.(
        orispl_code = orispl,
        genid,
        year,
        namepcap_egrid = namepcap
      )]
  )

# PlantInfo for nameplate capacity
plant_info_dt = fread(
  here("Data/electricity-generation/plant-info/PlantInfo(2005-2019).csv")
)[,.(
  year = `Op Year`,
  orispl_code = `Oris Code`,
  unitid = `Unit Id`,
  namepcap_p = `Nameplate Capacity`, 
  namepcap_units_p = `Nameplate Capacity UOM`,
  lat_p = `Latitude`, lon_p = `Longitude`
)]

# Stack heights 
stack_height_dt = 
  fread(
    here("Data/electricity-generation/plant-info/StackHeightInfo(20191202).csv")
  )[,.(
    orispl_code = `ORIS CODE`,
    unitid = `LOCATION IDENTIFIER`,
    stack_height = `STACK HEIGHT`
  )] %>%  
  unique() %>% 
  separate_rows(unitid, sep = ',') %>% 
  data.table() %>% 
  .[,.(
    stack_height = median(stack_height)),
    keyby = .(orispl_code, unitid)
  ]

plant_stack_height_dt = 
  stack_height_dt[,
    .(stack_height_orispl = median(stack_height)),
    keyby = orispl_code
  ]


# EIA 860 form 
eia860_plant_dt = 
  rbind(
    read_xlsx(
      here("Data/electricity-generation/eia860/eia8602018/2___Plant_Y2018.xlsx"),
      skip = 1
    ) |>
      data.table() %>% 
      .[,.(
        year = 2018, 
        eia_plant_id = `Plant Code`,
        nerc_eia = `NERC Region`,
        ba_code_eia = `Balancing Authority Code`,
        lat_eia = ifelse(`Plant Code` == "62262", 42.89903, Latitude),
        lon_eia = Longitude
      )],
    read_xlsx(
      here("Data/electricity-generation/eia860/eia8602019/2___Plant_Y2019.xlsx"),
      skip = 1
    ) |>
      data.table() %>% 
      .[,.(
        year = 2019, 
        eia_plant_id = `Plant Code`,
        nerc_eia = `NERC Region`,
        ba_code_eia = `Balancing Authority Code`,
        lat_eia = ifelse(`Plant Code` == "62262", 42.89903, Latitude),
        lon_eia = Longitude
      )],
    read_xlsx(
      here("Data/electricity-generation/eia860/eia8602020/2___Plant_Y2020.xlsx"),
      skip = 1
    ) |>
      data.table() %>% 
      .[,.(
        year = 2020, 
        eia_plant_id = `Plant Code`,
        nerc_eia = `NERC Region`,
        ba_code_eia = `Balancing Authority Code`,
        lat_eia = ifelse(`Plant Code` == "62262", 42.89903, Latitude),
        lon_eia = Longitude
      )]
  )

eia860_gen_dt = 
  rbind(
    read_xlsx(
      here("Data/electricity-generation/eia860/eia8602018/3_1_Generator_Y2018.xlsx"),
      skip = 1
    ) |>
      data.table() %>% 
      .[,.(
        year = 2018, 
        eia_plant_id = `Plant Code`,
        eia_generator_id = `Generator ID`,
        namepcap_eia = `Nameplate Capacity (MW)`
      )],
    read_xlsx(
      here("Data/electricity-generation/eia860/eia8602019/3_1_Generator_Y2019.xlsx"),
      skip = 1
    ) |>
      data.table() %>% 
      .[,.(
        year = 2019, 
        eia_plant_id = `Plant Code`,
        eia_generator_id = `Generator ID`,
        namepcap_eia = `Nameplate Capacity (MW)`
      )],
    read_xlsx(
      here("Data/electricity-generation/eia860/eia8602020/3_1_Generator_Y2020.xlsx"),
      skip = 1
    ) |>
      data.table() %>% 
      .[,.(
        year = 2020, 
        eia_plant_id = `Plant Code`,
        eia_generator_id = `Generator ID`,
        namepcap_eia = `Nameplate Capacity (MW)`
      )]
  )


# Merging together
unit_info_dt_raw = 
  merge(
    cems_unit_dt,  
    epa_eia_xwalk_dt[,.(
      lat_camd = median(camd_latitude, na.rm = TRUE), 
      lat_eiax = median(eia_latitude, na.rm = TRUE), 
      lon_camd = median(camd_longitude, na.rm = TRUE), 
      lon_eiax = median(eia_longitude, na.rm = TRUE),
      namepcap_eia = sum(eia_nameplate_capacity),
      namepcap_camd = sum(camd_nameplate_capacity)), 
      keyby = .(orispl_code = camd_plant_id, unitid = camd_unit_id)
    ],
    by = c('orispl_code','unitid'),
    all.x = TRUE
  ) |>
  merge(
    egrid_plant_dt,
    by = c("orispl_code",'year'),
    all.x = TRUE
  ) |>
  merge(
    eia860_plant_dt,
    by.x = c("orispl_code",'year'),
    by.y = c("eia_plant_id",'year'),
    all.x = TRUE
  ) |>
  merge(
    plant_info_dt, 
    by = c("orispl_code",'unitid',"year"),
    all.x = TRUE
  ) |> 
  merge(
    stack_height_dt,
    by = c("orispl_code",'unitid'),
    all.x = TRUE
  )|> 
  merge(
    plant_stack_height_dt,
    by = c("orispl_code"),
    all.x = TRUE
  )


# Collapsing all of the data into one column
unit_info_dt = 
  unit_info_dt_raw[,.(
    orispl_code, unitid, year, 
    fipsst,
    fipscnty,
    lat = fcase(
      !is.na(lat_camd), lat_camd,
      !is.na(lat_egrid), lat_egrid,
      !is.na(lat_p), lat_p,
      !is.na(lat_eiax), lat_eiax,
      !is.na(lat_eia), lat_eia
    ),
    lon = fcase(
      !is.na(lon_camd), lon_camd,
      !is.na(lon_egrid), lon_egrid,
      !is.na(lon_p), lon_p,
      !is.na(lon_eiax), lon_eiax,
      !is.na(lon_eia), lon_eia
    ),
    namepcap = fcase(
      !is.na(namepcap_p), as.numeric(namepcap_p),
      !is.na(namepcap_eia), namepcap_eia,
      !is.na(namepcap_camd), namepcap_camd,
      !is.na(namepcap_99), namepcap_99
    ),
    ba_code = fcase(
      !is.na(ba_code_egrid),ba_code_egrid,
      !is.na(ba_code_eia),ba_code_eia
    ),
    nerc = fcase(
      !is.na(nerc_egrid),nerc_egrid,
      !is.na(nerc_eia),nerc_eia
    ),
    egrid_subregion,
    stack_height = fcase(
      !is.na(stack_height), as.numeric(stack_height),
      !is.na(stack_height_orispl), stack_height_orispl,
      default = stack_height_dt[,.N,keyby=stack_height][which.max(N)]$stack_height |> as.numeric()
    ),
    open_status,
    tot_gload, 
    var_gload,
    tot_rows
  )][,
    tz := tz_lookup_coords(lat = lat, lon = lon, method = "accurate")
  ]

# Figuring out what county each plant is in 
p_load(tigris, purrr, sf)
options(tigris_use_cache = TRUE)

county_sf = map_dfr(
  unique(unit_info_dt[!is.na(fipsst)]$fipsst), 
  counties, 
  cb = TRUE,
  year = 2019
)

unit_county_dt = 
  st_as_sf(
    unit_info_dt[!is.na(lat)],
    coords = c('lon','lat'),
    crs = 4326
  ) %>%  
  st_transform(crs = st_crs(county_sf)) %>% 
  st_join(county_sf, left = TRUE) %>% 
  data.table() %>% 
  .[,.(
    orispl_code, unitid, year, 
    fips = fcase(
      !is.na(STATEFP) & !is.na(COUNTYFP), paste0(STATEFP, COUNTYFP),
      !is.na(fipsst) & !is.na(fipscnty), paste0(fipsst, fipscnty)
    )
  )]

# Assigning BA Codes to nerc regions 
ba_nerc_dt = unit_info_dt[!is.na(ba_code),.N,keyby = .(ba_code, nerc)]
# Crosswalk between BA Codes and nerc adj 
ba_nerc_xwalk = 
  ba_nerc_dt[
    ba_nerc_dt[,.(max_index = .I[N == max(N)]), by = ba_code]$max_index, .(
      ba_code, 
      nerc_adj = fcase(
        ba_code == "MISO", "MRO",
        ba_code == "PJM", "RFC",
        ba_code %in% c("BANC","CISO","IID","LDWP","TIDC"), "CAL",
        !is.na(nerc), nerc
      )
  )] |> # A few extras we have to add manually
  rbind(data.table(
    ba_code = c(
      'CPLW','YAD','HST','NSB','SEPA',
      "CHPD", "DOPD", "GCPD", "GRID","GWA","SCL","TPWR","WAUW","WWA",'HGMA',
      'GLHB'
    ),
    nerc_adj =c(
      rep('SERC',5),
      rep('WECC',10),
      rep('MRO',1)
    )
  ))
# Saving results for later use
fwrite(
  ba_nerc_xwalk,
  file = here('Data/electricity-generation/ba-nerc-xwalk.csv')
)


# Adding adjusted nerc regions back into the data and counties
unit_info_dt = 
  merge(
    unit_info_dt[,-c("nerc_adj",'fips','fipsst','fipscnty')],
    ba_nerc_xwalk, 
    by = 'ba_code',
    all.x = TRUE
  ) |>
  merge(
    unit_county_dt,
    by = c('orispl_code','unitid','year'),
    all.x = TRUE
  )

# Checking missing 
unit_info_dt[is.na(nerc_adj),.N]

# FOR MISSING NERC ADJ: Figuring out what zip code each plant is in 
zip_sf = zctas(cb = TRUE)
unit_zip_dt = 
  st_as_sf(
    unit_info_dt[!is.na(lat)],
    coords = c('lon','lat'),
    crs = 4326
  ) %>%  
  st_transform(crs = st_crs(zip_sf)) %>% 
  st_join(zip_sf, left = TRUE) %>% 
  data.table() %>% 
  .[,.(
    orispl_code, unitid, year, 
    zip = GEOID20
  )]

# Loading Power Profiler Zip to eGRID subregion crosswalk 
zip_egrid_dt = 
  read_xlsx(
    here('Data/electricity-generation/egrid/power_profiler_zipcode_tool_2019.xlsx'),
    sheet = 'Zip-subregion'
  ) |> 
  data.table() |> 
  clean_names() %>% 
  .[,.(zip = zip_character, egrid_subregion= e_grid_subregion_number_1)]
 
# Table matching egrid subregions to NERC regions 
egrid_nerc_dt = 
  unit_info_dt[
    !is.na(egrid_subregion),
    .N,
    keyby = .(egrid_subregion, nerc_adj)
  ][, # Picking nerc_adj with most units in it for each subregion
    .SD[which.max(N)], 
    keyby = egrid_subregion
  ]

nerc_adj_zip_dt =
  merge(
    unit_zip_dt,
    zip_egrid_dt,
    by = 'zip',
    all.x = TRUE
  ) |>
  merge(
    egrid_nerc_dt,
    by = 'egrid_subregion',
    all.x = TRUE
  ) %>% 
  .[,.(orispl_code, unitid, nerc_adj_zip = nerc_adj)]

# Adding zip code matched nerc_adj back into the data 
unit_info_dt = 
  merge(
    unit_info_dt,
    nerc_adj_zip_dt,
    by = c('orispl_code','unitid'),
    all.x = TRUE
  )[,':='(
    nerc_adj = ifelse(is.na(nerc_adj), nerc_adj_zip, nerc_adj),
    nerc_adj_zip = NULL
  )]

# Making clean unit names to put in file names
unit_info_dt[,clean_unitid := make_clean_names(unitid)]

# Saving the results
write.fst(
  x = unit_info_dt,
  path = here("Data/electricity-generation/unit-info-dt.fst")
)

